In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner

# filenames of student notebooks, grouped per homework assignment
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
# cap each homework at its first 59 notebooks
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
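The corpus for this run is four assignments (hw2 through hw5; hw0 and hw1 are commented out below) at 59 notebooks each, which matches the 236 notebooks reported by every pipeline stage.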

In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
#a = Features(hw_notebooks[0], 'hw0')
#a.add_notebooks(hw_notebooks[1], 'hw1')
a = Features(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
agr = ASTGraphReducer(a, threshold=8, split_call=False)
ci = CorpusIdentifier()
fi = FrequentItemsets()
pipe = Pipeline([gastf, rbn, gi, agr, ci, fi])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7fdb6a3f2cf8>
236
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7fdb7f236278>
236
<nbminer.preprocess.get_imports.GetImports object at 0x7fdb6a416c18>
236
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x7fdb6a417550>
236
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x7fdb6a4175c0>
236
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x7fdb6a417748>
236
8526
0.12496685981750488
100%|██████████| 294/294 [00:12<00:00, 24.03it/s]
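Each stage prints itself and the 236 notebooks it handled. Reading the steps off their class names: extract AST features per cell, resample cells by AST node, collect imports, reduce the AST graphs to recurring templates (threshold=8), register every cell with the corpus identifier, and mine frequent itemsets of templates over the resulting buckets.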

In [3]:
# group each frequent-itemset bucket's patterns under the notebook it came from;
# the inner loop just reads the notebook name off the bucket's cells
notebook_patterns = {}
for bucket in fi.buckets:
    name = None
    for cell in bucket.items:
        name = cell.get_feature('notebook_name')
    if name not in notebook_patterns:
        notebook_patterns[name] = []
    notebook_patterns[name].append(bucket.get_patterns())

In [4]:
# flatten each notebook's per-bucket pattern lists into one set of distinct itemsets
notebook_itemsets = {}
for key in notebook_patterns.keys():
    itemsets = []
    for patterns in notebook_patterns[key]:
        itemsets.extend(patterns)
    notebook_itemsets[key] = set(itemsets)

In [5]:
keys = list(notebook_itemsets.keys())
print(len(notebook_itemsets[keys[4]]))


24

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist([len(itemsets) for itemsets in notebook_itemsets.values()])


Out[6]:
(array([ 4., 20., 43., 66., 41., 31., 15.,  9.,  5.,  2.]),
 array([ 0.,  7., 14., 21., 28., 35., 42., 49., 56., 63., 70.]),
 <a list of 10 Patch objects>)
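Most notebooks carry between roughly 14 and 42 distinct itemsets; four have fewer than 7, and a thin tail reaches 70.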

In [7]:
total_set = []
for key in notebook_itemsets.keys():
    total_set.extend(notebook_itemsets[key])
print(len(set(total_set)))


231
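Across the whole corpus there are only 231 distinct itemsets, so individual notebooks (a few dozen itemsets each) draw from a fairly small shared vocabulary.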

Adding in the bottom-up features


In [8]:
X, y, names = ci.get_data_set_full()
notebook_templates = {}
notebook_labels = {}
for i, name in enumerate(names):
    notebook_templates[name] = X[i]
    notebook_labels[name] = y[i]

In [9]:
print(len(notebook_labels.keys()))
print(len(notebook_itemsets.keys()))
print(len(notebook_templates.keys()))


236
236
236

In [10]:
notebook_itemsets[None]


Out[10]:
set()
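The None key collects buckets whose cells carried no notebook name; it is empty here. Note that notebook_itemsets has 236 keys including None, i.e. only 235 named notebooks, which is why X below ends up with 235 rows even though the corpus identifier tracked 236 notebooks.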

In [11]:
X = []
y = []
for key in notebook_itemsets.keys():
    if key is None:  # drop the unnamed bucket
        continue
    # one token per itemset, e.g. 'itemset_template_109_template_202'
    itemset_component = ['itemset_' + '_'.join(el) for el in notebook_itemsets[key]]
    template_component = notebook_templates[key]
    X.append(itemset_component + template_component)
    y.append(notebook_labels[key])
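The next cell scores every pair of notebooks with Jaccard similarity over their combined feature lists, J(A, B) = |A ∩ B| / |A ∪ B|. A minimal sketch on toy sets (the tokens here are made up):

In [ ]:
a = {'template_1', 'template_2', 'itemset_x'}
b = {'template_2', 'itemset_x', 'itemset_y'}
print(len(a & b) / len(a | b))  # 2 shared / 4 total = 0.5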

In [12]:
import tqdm
# pairwise Jaccard similarity between the notebooks' feature lists;
# build each set once instead of rebuilding it inside the double loop
feature_sets = [set(el) for el in X]
similarities = np.zeros((len(X), len(X)))
for i in tqdm.tqdm(range(len(X))):
    for j in range(len(X)):
        union = feature_sets[i] | feature_sets[j]
        if len(union) == 0:
            continue
        similarities[i][j] = len(feature_sets[i] & feature_sets[j]) / len(union)

def get_avg_inter_intra_sims(X, y, val):
    # a pair counts as intra only when both notebooks come from homework `val`;
    # every other pair (including pairs from two other homeworks) counts as inter
    inter_sims = []
    intra_sims = []
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if y[i] == y[j] and y[i] == val:
                intra_sims.append(similarities[i][j])
            else:
                inter_sims.append(similarities[i][j])
    return np.array(intra_sims), np.array(inter_sims)

for i in np.unique(y):
    intra_sims, inter_sims = get_avg_inter_intra_sims(X, y, i)
    print('Mean intra similarity for hw',i,'is',np.mean(intra_sims),'with std',np.std(intra_sims))
    print('Mean inter similarity for hw',i,'is',np.mean(inter_sims),'with std',np.std(inter_sims))
    print('----')


100%|██████████| 235/235 [00:02<00:00, 88.23it/s]
Mean intra similarity for hw 0 is 0.2907510351440267 with std 0.07579900698652764
Mean inter similarity for hw 0 is 0.25090079814997646 with std 0.07200999591556746
----
Mean intra similarity for hw 1 is 0.26476969204495626 with std 0.07765813554108275
Mean inter similarity for hw 1 is 0.25265215152388015 with std 0.07251359060418164
----
Mean intra similarity for hw 2 is 0.28916346724844183 with std 0.07630218121200974
Mean inter similarity for hw 2 is 0.2510061475398828 with std 0.0720317907724856
----
Mean intra similarity for hw 3 is 0.2998928320685052 with std 0.0740154048013281
Mean inter similarity for hw 3 is 0.2502941578095412 with std 0.07175589126901798
----
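For every homework the intra-class mean sits above the inter-class mean (e.g. 0.291 vs 0.251 for hw 0), but the gap is small relative to the ~0.07 standard deviations, so the distributions overlap heavily; hw 1 is the weakest separation at 0.265 vs 0.253.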

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 5, 10
def get_all_sims(X, y, val):
    sims = []
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if y[i] == val or y[j] == val:
                sims.append(similarities[i][j])
    return sims
labels = np.unique(y)  # only four homework labels in this run
fig, axes = plt.subplots(len(labels))
for i, val in enumerate(labels):
    axes[i].hist(get_all_sims(X, y, val), bins=30)



In [14]:
tot = []
for el in X:
    tot.extend(el)
print(len(set(tot)))


370
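370 distinct tokens in total, which is consistent with the 231 itemsets counted earlier plus 138 template tokens and one empty-string token (visible in the printout below).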

In [15]:
print(X[0])


['itemset_template_202_template_663', 'itemset_template_109_template_202', 'itemset_template_109_template_202_template_663', 'itemset_template_109_template_184', 'itemset_template_2486', 'itemset_template_109_template_202_template_210', 'itemset_template_1549', 'itemset_template_109_template_184_template_202', 'itemset_template_210', 'itemset_template_202_template_686', 'itemset_template_217', 'itemset_template_107_template_109', 'itemset_template_202_template_613', 'itemset_template_202', 'itemset_template_105_template_109', 'itemset_template_109_template_210', 'itemset_template_105_template_107', 'itemset_template_184_template_210', 'itemset_template_105_template_107_template_109', 'itemset_template_109_template_663', 'itemset_template_1159', 'itemset_template_109', 'itemset_template_202_template_210', 'itemset_template_184_template_202', 'itemset_template_2472', 'template_105', 'template_105', 'template_105', 'template_105', 'template_105', 'template_105', 'template_107', 'template_107', 'template_107', 'template_107', 'template_109', 'template_109', 'template_109', 'template_109', 'template_202', 'template_109', 'template_202', 'template_109', 'template_202', 'template_109', 'template_109', 'template_109', 'template_202', 'template_109', '', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_202', 'template_109', 'template_2220', 'template_202', '', 'template_202', 'template_109', 'template_109', 'template_109', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_663', 'template_109', 'template_109', 'template_109', 'template_202', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_109', 'template_202', 'template_202', 'template_663', 'template_109', 'template_109', 'template_109', 'template_202', 'template_202', 'template_1549', 'template_613', 'template_109', 'template_109', 'template_686', 'template_202', 'template_109', 'template_202', 'template_109', 'template_202', 'template_109', 'template_202', 'template_109', 'template_202', 'template_686', 'template_202', 'template_109', 'template_202', 'template_109', 'template_201', 'template_2486', 'template_2472', 'template_202', 'template_217', 'template_363', 'template_363', 'template_686', 'template_202', 'template_109', 'template_202', 'template_109', 'template_202', 'template_202', '', 'template_109', 'template_109', 'template_217', '', 'template_109', 'template_202', '', 'template_109', 'template_109', 'template_109', 'template_202', 'template_202', 'template_202', 'template_184', 'template_210', '', 'template_109', 'template_109', 'template_109', 'template_109', 'template_1159']

In [16]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# import explicitly: `import sklearn` alone does not load the
# feature_extraction.text submodule
from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words over the feature tokens: one column per distinct token
countvec = CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)

In [17]:
X.shape


Out[17]:
(235, 369)
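369 columns rather than 370, presumably because CountVectorizer's default token pattern only keeps tokens of two or more word characters, so the empty strings in the feature lists contribute nothing. A quick illustration (the tokens are made up):

In [ ]:
demo = CountVectorizer()
demo.fit(['template_105 template_105 itemset_a', ''])
print(demo.vocabulary_)  # two entries, none for the empty string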

In [36]:
from sklearn.ensemble import RandomForestClassifier

# shuffle, then drop the last 138 columns (the template_* counts; the
# vocabulary is sorted, so itemset_* features come first) -- this
# cross-validates on the itemset features alone
p = np.random.permutation(X.shape[0])
Xt = X.todense()[p][:, :-138]
yt = np.array(y)[p]
clf = RandomForestClassifier(n_estimators=400, max_depth=3)
scores = cross_val_score(clf, Xt, yt, cv=10)
print(scores)
print(np.mean(scores))


[0.83333333 0.70833333 0.70833333 0.79166667 0.58333333 0.625
 0.66666667 0.70833333 0.7826087  0.7       ]
0.7107608695652174
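About 71% mean accuracy over 10 folds from the itemset features alone, against a 25% chance baseline for four balanced classes.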

In [35]:
from sklearn.ensemble import AdaBoostClassifier
# same setup, but on the full feature matrix (itemsets and templates)
p = np.random.permutation(X.shape[0])
Xt = X.todense()[p]
yt = np.array(y)[p]
clf = AdaBoostClassifier(n_estimators=400)
scores = cross_val_score(clf, Xt, yt, cv=10)
print(scores)
print(np.mean(scores))


[0.66666667 0.75       0.83333333 0.58333333 0.625      0.75
 0.95833333 0.75       0.7826087  0.95      ]
0.764927536231884
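AdaBoost on the full matrix averages about 76%, a few points above the forest, though the fold scores swing widely (0.58 to 0.96).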

In [33]:
X[:, -138:][1].todense()  # the template-count block for a single notebook


Out[33]:
matrix([[ 5,  9,  0, 70,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  2,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          3,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0, 36,  6,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
         16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  2,  0,  0,  0,  2,  0,  1,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  3,  2,  0,  0,  0,  0,  0]])

In [19]:
clf.fit(X[:360], y[:360])  # note: X has only 235 rows, so [:360] is the entire matrix


Out[19]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
clf.predict(X[300:])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-0ccc389acdaf> in <module>()
----> 1 clf.predict(X[300:])

/dfs/scratch2/fcipollone/test/lib/python3.4/site-packages/sklearn/ensemble/forest.py in predict(self, X)
    536             The predicted classes.
    537         """
--> 538         proba = self.predict_proba(X)
    539 
    540         if self.n_outputs_ == 1:

/dfs/scratch2/fcipollone/test/lib/python3.4/site-packages/sklearn/ensemble/forest.py in predict_proba(self, X)
    576         check_is_fitted(self, 'estimators_')
    577         # Check data
--> 578         X = self._validate_X_predict(X)
    579 
    580         # Assign chunk of trees to jobs

/dfs/scratch2/fcipollone/test/lib/python3.4/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
    355                                  "call `fit` before exploiting the model.")
    356 
--> 357         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    358 
    359     @property

/dfs/scratch2/fcipollone/test/lib/python3.4/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    371         """Validate X whenever one tries to predict, apply, predict_proba"""
    372         if check_input:
--> 373             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    374             if issparse(X) and (X.indices.dtype != np.intc or
    375                                 X.indptr.dtype != np.intc):

/dfs/scratch2/fcipollone/test/lib/python3.4/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    460                              " minimum of %d is required%s."
    461                              % (n_samples, shape_repr, ensure_min_samples,
--> 462                                 context))
    463 
    464     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0, 369)) while a minimum of 1 is required.
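The failure is in the slicing, not the model: X has only 235 rows, so X[300:] is empty and predict receives zero samples.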

In [ ]:
y[300:]

In [ ]:
print(np.sum(clf.predict(X[:360]) == y[:360]) / len(y[:360]))  # training accuracy
